clear all
set more off , permanently
capture log close
set scheme lean2

***********************************************************************************************
********** This file prepares data for SDID analysis: MTU test phase not dropped **************
***********************************************************************************************

******************************************************
******************* Load data ************************
******************************************************

// Load German data: baseline 1 day carryforward of prices //

	* Load price panel
		cd "$dta"
		use "price_panel.dta" , clear
		
	* Only keep panel prices
		keep id_data date source dow weekend *_panel
		
	** Merge with daily weighted average and average prices
		cd "$dta"
		joinby id_data date source using "price_panel_wprice", unmatched(master)
		cap drop _merge 

	* Only keep weekdays and restrict analysis to September 2012 - December 2014
		drop if weekend == 1
		gen year = year(date)
		gen month = month(date)
		drop if year == 2015

	* Save temp
		cd "$dta2"
		compress
		save interm_panel.dta , replace

	* Save a list with relevant dates for which we need refinery prices
		keep date
		gen pepe = 1
		collapse (mean) pepe , by(date)
		sort date
		drop pepe
		keep if date >= d(01mar2013) & date <= d(31dec2014)
		save date_list.dta , replace
		
	* Prepare crude oil price 
		use "$dta\rotterdam_price.dta" , clear
		replace p_rotterdam = p_rotterdam / 100
		
		* Interpolate crude oil price on days where we have fuel price, but not crude oil price (Rule: If price missing, usually because of holiday, use price of day before.)
			merge 1:1 date using date_list.dta
			drop _merge
			sort date
			replace p_rotterdam = p_rotterdam[_n-1] if p_rotterdam == .
		label var p_rotterdam "Net average Rotterdam price in Euro per litre"
		
		save rotterdam_prices.dta , replace
		
		erase date_list.dta
		

	* Load station data
		cd "$dta"
		use stations.dta , clear

	keep id_data state latitude longitude										

	* Merge price panel and station data
		merge 1:m id_data using interm_panel.dta												
		drop if _merge == 1 | _merge == 2
		drop _merge
		erase interm_panel.dta

	* Merge with cost and tax data: petrol
		merge m:1 year month using "$dta\input_cost_petrol.dta"
		drop if _merge == 2
		drop _merge
		
	* Merge with cost and tax data: diesel
		merge m:1 year month using "$dta\input_cost_diesel.dta"
		drop if _merge == 2
		drop _merge	

	* Input crude oil price data
		merge m:1 date using "$dta\rotterdam_prices.dta"
		assert _merge == 3 if date >= d(01mar2013) & date <= d(31dec2014)
		drop if _merge ==2
		drop _merge

	* Reformat input cost and fuel tax
		replace input_cost = input_cost / 100
		replace fuel_tax = fuel_tax / 100
		replace input_cost_diesel = input_cost_diesel / 100
		replace fuel_tax_diesel = fuel_tax_diesel / 100
		label var input_cost "Average monthly cost of petrol to fuel stations in € per litre"
		label var fuel_tax "Lump sum petrol tax in € per litre"
		label var ebv "Lump sum petrol storage levy in € per litre"
		label var input_cost_diesel "Average monthly cost of diesel to fuel stations in € per litre"
		label var fuel_tax_diesel "Lump sum diesel tax in € per litre"
		label var ebv_diesel "Lump sum diesel storage levy in € per litre"
		label var year "Year"
		label var month "Month"

	* Specify panel structure
		encode id_data , g(station_id)
		xtset station_id date

	* Create data set only with prices at 5pm
		keep station_id state date source e5_17oclock_panel gazole_17oclock_panel wavg_price_e5 wavg_price_gazole  p_rotterdam ebv ebv_diesel fuel_tax fuel_tax_diesel year latitude longitude id_data
		
		* Create retail margins for 17 o'clock
			gen e5_margin_17oclock = (e5_17oclock_panel/1.19 - fuel_tax - ebv - p_rotterdam)*100
			gen e5_netp_17oclock = (e5_17oclock_panel/1.19 - fuel_tax - ebv)
			gen gazole_margin_17oclock = (gazole_17oclock_panel/1.19 - fuel_tax_diesel - ebv_diesel - p_rotterdam)*100 // for diesel
			gen gazole_netp_17oclock = (gazole_17oclock_panel/1.19 - fuel_tax_diesel - ebv_diesel) // for diesel
		
		* Label margins
			label var e5_margin_17oclock "Retail margin (Eurocent/litre) at 17 o'clock, petrol"
			label var gazole_margin_17oclock "Retail margin (Eurocent/litre) at 17 o'clock, diesel"
			label var wavg_price_e5 "Daily weighted avg. price (Eurocent/litre), E5"
			label var wavg_price_gazole "Daily weighted avg. price (Eurocent/litre), diesel"
			
		* Generate country
		gen country = "de"
		
		* Keep only relevant time frame
			drop if year != 2013 & year != 2014
		
		* Keep relevant variables
			drop source fuel_tax ebv fuel_tax_diesel ebv_diesel year p_rotterdam
			
		* Decode variables
			*decode state , g(state1)
			*drop state
			*rename state1 state
			drop station_id
			rename id_data station_id
						
		cd "$dta"
		save germany_prices_5pm.dta , replace


// Load French data //

	* Get data
		cd "$dta"
		use "france_margins_5pm.dta" , clear
	
	* Keep relevant variables
		keep date state gross_p_france margin_france net_p_france gross_p_france_diesel margin_france_diesel net_p_france_diesel id_pdv lat lon

	* Fix latitude and longitude variables
		foreach i in lat lon {
			tostring `i', gen(`i'_str) force
			replace `i'_str = subinstr(`i'_str,"-.","-0.",.)
			split `i'_str, parse(".")
	
			destring `i'_str1, gen(`i'1) force
			replace `i'1 = `i'1 / 100000
		}
		rename lat1 latitude
		rename lon1 longitude
		drop lat lon *_str*

	* Drop Corsica
		drop if state == "Corse"
		
	* Rename variables
		rename id_pdv station_id
		rename gross_p_france e5_17oclock_panel
		rename margin_france e5_margin_17oclock
		rename net_p_france e5_netp_17oclock
		rename gross_p_france_diesel gazole_17oclock_panel
		rename margin_france_diesel gazole_margin_17oclock
		rename net_p_france_diesel gazole_netp_17oclock
		
	* Convert margins in Eurocent
		replace e5_margin_17oclock = e5_margin_17oclock*100
		replace gazole_margin_17oclock = gazole_margin_17oclock*100

	* Generate country
		gen country = "fr"
		
	* String station_id
		tostring station_id , replace
			
	** For July 2023 robustness check - MPD effect using daily weighted avg. price in Germany (keep 5pm price in France): 
		gen wavg_price_e5 = e5_17oclock_panel
		gen wavg_price_gazole = gazole_17oclock_panel
		
	* Append with German data
		append using germany_prices_5pm.dta, force
	
		label var country "Country"		

	// Create control groups using local monopolists within 1km, 3km and 5km: done for stations in Germany only
		rename station_id id_data
		cd "$dta"
		merge m:1 id_data using "control_group_mtu_intro_1km_market.dta"
		gen treated_1km_market = 1
		replace treated_1km_market = 0 if _merge == 3
		drop if _merge == 2
		drop _merge
		
		cd "$dta"
		merge m:1 id_data using "control_group_mtu_intro_3km_market.dta"
		gen treated_3km_market = 1
		replace treated_3km_market = 0 if _merge == 3
		drop if _merge == 2
		drop _merge
		
		cd "$dta"
		merge m:1 id_data using "control_group_mtu_intro_5km_market.dta"
		gen treated_5km_market = 1
		replace treated_5km_market = 0 if _merge == 3
		drop if _merge == 2
		drop _merge
		
		rename id_data station_id

	* Add Rotterdam prices
		merge m:1 date using "rotterdam_prices.dta"

	* Tab date if no Rotterdam price --> Missing on 1 Jan 2013 only, as no carry-forward possible from 2012 (no oil price data for 2012 used)
		tab date if _merge == 1
		drop _merge
		
	* Save data set
		save germany_france.dta , replace
		
	* Erase intermediate dtas
		erase rotterdam_prices.dta 
